!pip install imblearn
Requirement already satisfied: imblearn in c:\users\ganesh\anaconda3\lib\site-packages (0.0) Requirement already satisfied: imbalanced-learn in c:\users\ganesh\anaconda3\lib\site-packages (from imblearn) (0.8.1) Requirement already satisfied: scipy>=0.19.1 in c:\users\ganesh\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.6.2) Requirement already satisfied: numpy>=1.13.3 in c:\users\ganesh\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.20.1) Requirement already satisfied: joblib>=0.11 in c:\users\ganesh\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.0.1) Requirement already satisfied: scikit-learn>=0.24 in c:\users\ganesh\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (0.24.1) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\ganesh\anaconda3\lib\site-packages (from scikit-learn>=0.24->imbalanced-learn->imblearn) (2.1.0)
### IMPORT: ------------------------------------
import scipy.stats as stats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
import statsmodels.api as sm
#--Sklearn library--
from sklearn.model_selection import train_test_split,StratifiedKFold, cross_val_score # Sklearn package's randomized data splitting function
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
AdaBoostClassifier,
GradientBoostingClassifier,
RandomForestClassifier,
BaggingClassifier,
StackingClassifier
)
from xgboost import XGBClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression
# To impute missing values
from sklearn.impute import KNNImputer
# Libtune to tune model, get different metric scores
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score,f1_score
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay,plot_confusion_matrix #to plot confusion matric
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 25)
pd.set_option('display.max_colwidth',200)
# To supress numerical display in scientific notations
pd.set_option('display.float_format', lambda x: '%.5f' % x)
warnings.filterwarnings('ignore') # To supress warnings
# set the background for the graphs
plt.style.use('ggplot')
# For pandas profiling
# from pandas_profiling import ProfileReport
# print('Load Libraries-Done')
# !pip install -U pandas-profiling
df=pd.read_csv("BankChurners.csv")
df_credit=df.copy()
print(f'There are {df_credit.shape[0]} rows and {df_credit.shape[1]} columns') # fstring
There are 10127 rows and 21 columns
# View the first 5 rows of the dataset.
df_credit.head()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691.00000 | 777 | 11914.00000 | 1.33500 | 1144 | 42 | 1.62500 | 0.06100 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256.00000 | 864 | 7392.00000 | 1.54100 | 1291 | 33 | 3.71400 | 0.10500 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418.00000 | 0 | 3418.00000 | 2.59400 | 1887 | 20 | 2.33300 | 0.00000 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313.00000 | 2517 | 796.00000 | 1.40500 | 1171 | 20 | 2.33300 | 0.76000 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716.00000 | 0 | 4716.00000 | 2.17500 | 816 | 28 | 2.50000 | 0.00000 |
# last 5 rows
df_credit.tail()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10122 | 772366833 | Existing Customer | 50 | M | 2 | Graduate | Single | $40K - $60K | Blue | 40 | 3 | 2 | 3 | 4003.00000 | 1851 | 2152.00000 | 0.70300 | 15476 | 117 | 0.85700 | 0.46200 |
| 10123 | 710638233 | Attrited Customer | 41 | M | 2 | NaN | Divorced | $40K - $60K | Blue | 25 | 4 | 2 | 3 | 4277.00000 | 2186 | 2091.00000 | 0.80400 | 8764 | 69 | 0.68300 | 0.51100 |
| 10124 | 716506083 | Attrited Customer | 44 | F | 1 | High School | Married | Less than $40K | Blue | 36 | 5 | 3 | 4 | 5409.00000 | 0 | 5409.00000 | 0.81900 | 10291 | 60 | 0.81800 | 0.00000 |
| 10125 | 717406983 | Attrited Customer | 30 | M | 2 | Graduate | NaN | $40K - $60K | Blue | 36 | 4 | 3 | 3 | 5281.00000 | 0 | 5281.00000 | 0.53500 | 8395 | 62 | 0.72200 | 0.00000 |
| 10126 | 714337233 | Attrited Customer | 43 | F | 2 | Graduate | Married | Less than $40K | Silver | 25 | 6 | 2 | 4 | 10388.00000 | 1961 | 8427.00000 | 0.70300 | 10294 | 61 | 0.64900 | 0.18900 |
#Understand the dataset.
#get the size of dataframe
print ("Rows : " , df_credit.shape[0]) #get number of rows/observations
print ("Columns : " , df_credit.shape[1]) #get number of columns
print ("#"*40,"\n","Features : \n\n", df_credit.columns.tolist()) #get name of columns/features
missing_df = pd.DataFrame({
"Missing": df_credit.isnull().sum(),
"Missing %": round((df_credit.isnull().sum()/ df_credit.isna().count()*100), 2)
})
display(missing_df.sort_values(by='Missing', ascending=False))
Rows : 10127 Columns : 21 ######################################## Features : ['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender', 'Dependent_count', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']
| Missing | Missing % | |
|---|---|---|
| Education_Level | 1519 | 15.00000 |
| Marital_Status | 749 | 7.40000 |
| CLIENTNUM | 0 | 0.00000 |
| Contacts_Count_12_mon | 0 | 0.00000 |
| Total_Ct_Chng_Q4_Q1 | 0 | 0.00000 |
| Total_Trans_Ct | 0 | 0.00000 |
| Total_Trans_Amt | 0 | 0.00000 |
| Total_Amt_Chng_Q4_Q1 | 0 | 0.00000 |
| Avg_Open_To_Buy | 0 | 0.00000 |
| Total_Revolving_Bal | 0 | 0.00000 |
| Credit_Limit | 0 | 0.00000 |
| Total_Relationship_Count | 0 | 0.00000 |
| Months_Inactive_12_mon | 0 | 0.00000 |
| Attrition_Flag | 0 | 0.00000 |
| Months_on_book | 0 | 0.00000 |
| Card_Category | 0 | 0.00000 |
| Income_Category | 0 | 0.00000 |
| Dependent_count | 0 | 0.00000 |
| Gender | 0 | 0.00000 |
| Customer_Age | 0 | 0.00000 |
| Avg_Utilization_Ratio | 0 | 0.00000 |
# Summary of the dataset.
# Replace NAN values with most occured category in actual vairable
# DataFrame[ColName].fillna(Mode_Category,inplace=True)
df_credit["Education_Level"].fillna("Unknown",inplace=True)
df_credit["Marital_Status"].fillna("Unknown",inplace=True)
#### Check the data types of the columns for the dataset.
df_credit.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int64 1 Attrition_Flag 10127 non-null object 2 Customer_Age 10127 non-null int64 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int64 5 Education_Level 10127 non-null object 6 Marital_Status 10127 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int64 10 Total_Relationship_Count 10127 non-null int64 11 Months_Inactive_12_mon 10127 non-null int64 12 Contacts_Count_12_mon 10127 non-null int64 13 Credit_Limit 10127 non-null float64 14 Total_Revolving_Bal 10127 non-null int64 15 Avg_Open_To_Buy 10127 non-null float64 16 Total_Amt_Chng_Q4_Q1 10127 non-null float64 17 Total_Trans_Amt 10127 non-null int64 18 Total_Trans_Ct 10127 non-null int64 19 Total_Ct_Chng_Q4_Q1 10127 non-null float64 20 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(10), object(6) memory usage: 1.6+ MB
df_credit.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| CLIENTNUM | 10127.00000 | 739177606.33366 | 36903783.45023 | 708082083.00000 | 713036770.50000 | 717926358.00000 | 773143533.00000 | 828343083.00000 |
| Customer_Age | 10127.00000 | 46.32596 | 8.01681 | 26.00000 | 41.00000 | 46.00000 | 52.00000 | 73.00000 |
| Dependent_count | 10127.00000 | 2.34620 | 1.29891 | 0.00000 | 1.00000 | 2.00000 | 3.00000 | 5.00000 |
| Months_on_book | 10127.00000 | 35.92841 | 7.98642 | 13.00000 | 31.00000 | 36.00000 | 40.00000 | 56.00000 |
| Total_Relationship_Count | 10127.00000 | 3.81258 | 1.55441 | 1.00000 | 3.00000 | 4.00000 | 5.00000 | 6.00000 |
| Months_Inactive_12_mon | 10127.00000 | 2.34117 | 1.01062 | 0.00000 | 2.00000 | 2.00000 | 3.00000 | 6.00000 |
| Contacts_Count_12_mon | 10127.00000 | 2.45532 | 1.10623 | 0.00000 | 2.00000 | 2.00000 | 3.00000 | 6.00000 |
| Credit_Limit | 10127.00000 | 8631.95370 | 9088.77665 | 1438.30000 | 2555.00000 | 4549.00000 | 11067.50000 | 34516.00000 |
| Total_Revolving_Bal | 10127.00000 | 1162.81406 | 814.98734 | 0.00000 | 359.00000 | 1276.00000 | 1784.00000 | 2517.00000 |
| Avg_Open_To_Buy | 10127.00000 | 7469.13964 | 9090.68532 | 3.00000 | 1324.50000 | 3474.00000 | 9859.00000 | 34516.00000 |
| Total_Amt_Chng_Q4_Q1 | 10127.00000 | 0.75994 | 0.21921 | 0.00000 | 0.63100 | 0.73600 | 0.85900 | 3.39700 |
| Total_Trans_Amt | 10127.00000 | 4404.08630 | 3397.12925 | 510.00000 | 2155.50000 | 3899.00000 | 4741.00000 | 18484.00000 |
| Total_Trans_Ct | 10127.00000 | 64.85869 | 23.47257 | 10.00000 | 45.00000 | 67.00000 | 81.00000 | 139.00000 |
| Total_Ct_Chng_Q4_Q1 | 10127.00000 | 0.71222 | 0.23809 | 0.00000 | 0.58200 | 0.70200 | 0.81800 | 3.71400 |
| Avg_Utilization_Ratio | 10127.00000 | 0.27489 | 0.27569 | 0.00000 | 0.02300 | 0.17600 | 0.50300 | 0.99900 |
# Droping CLIENTNUM
df_credit.drop(['CLIENTNUM'],axis=1,inplace=True)
cat_cols = ['Attrition_Flag','Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category','Dependent_count','Total_Relationship_Count','Months_Inactive_12_mon','Contacts_Count_12_mon']
for col in cat_cols:
print(f"Feature: {col}")
print("-"*40)
display(pd.DataFrame({"Counts": df_credit[col].value_counts(dropna=False)}).sort_values(by='Counts', ascending=False))
Feature: Attrition_Flag ----------------------------------------
| Counts | |
|---|---|
| Existing Customer | 8500 |
| Attrited Customer | 1627 |
Feature: Gender ----------------------------------------
| Counts | |
|---|---|
| F | 5358 |
| M | 4769 |
Feature: Education_Level ----------------------------------------
| Counts | |
|---|---|
| Graduate | 3128 |
| High School | 2013 |
| Unknown | 1519 |
| Uneducated | 1487 |
| College | 1013 |
| Post-Graduate | 516 |
| Doctorate | 451 |
Feature: Marital_Status ----------------------------------------
| Counts | |
|---|---|
| Married | 4687 |
| Single | 3943 |
| Unknown | 749 |
| Divorced | 748 |
Feature: Income_Category ----------------------------------------
| Counts | |
|---|---|
| Less than $40K | 3561 |
| $40K - $60K | 1790 |
| $80K - $120K | 1535 |
| $60K - $80K | 1402 |
| abc | 1112 |
| $120K + | 727 |
Feature: Card_Category ----------------------------------------
| Counts | |
|---|---|
| Blue | 9436 |
| Silver | 555 |
| Gold | 116 |
| Platinum | 20 |
Feature: Dependent_count ----------------------------------------
| Counts | |
|---|---|
| 3 | 2732 |
| 2 | 2655 |
| 1 | 1838 |
| 4 | 1574 |
| 0 | 904 |
| 5 | 424 |
Feature: Total_Relationship_Count ----------------------------------------
| Counts | |
|---|---|
| 3 | 2305 |
| 4 | 1912 |
| 5 | 1891 |
| 6 | 1866 |
| 2 | 1243 |
| 1 | 910 |
Feature: Months_Inactive_12_mon ----------------------------------------
| Counts | |
|---|---|
| 3 | 3846 |
| 2 | 3282 |
| 1 | 2233 |
| 4 | 435 |
| 5 | 178 |
| 6 | 124 |
| 0 | 29 |
Feature: Contacts_Count_12_mon ----------------------------------------
| Counts | |
|---|---|
| 3 | 3380 |
| 2 | 3227 |
| 1 | 1499 |
| 4 | 1392 |
| 0 | 399 |
| 5 | 176 |
| 6 | 54 |
## Converting the data type of categorical features to 'category'
cat_cols = ['Attrition_Flag','Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category','Dependent_count','Total_Relationship_Count','Months_Inactive_12_mon','Contacts_Count_12_mon']
df_credit[cat_cols] = df_credit[cat_cols].astype('category')
df_credit.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null category 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null category 3 Dependent_count 10127 non-null category 4 Education_Level 10127 non-null category 5 Marital_Status 10127 non-null category 6 Income_Category 10127 non-null category 7 Card_Category 10127 non-null category 8 Months_on_book 10127 non-null int64 9 Total_Relationship_Count 10127 non-null category 10 Months_Inactive_12_mon 10127 non-null category 11 Contacts_Count_12_mon 10127 non-null category 12 Credit_Limit 10127 non-null float64 13 Total_Revolving_Bal 10127 non-null int64 14 Avg_Open_To_Buy 10127 non-null float64 15 Total_Amt_Chng_Q4_Q1 10127 non-null float64 16 Total_Trans_Amt 10127 non-null int64 17 Total_Trans_Ct 10127 non-null int64 18 Total_Ct_Chng_Q4_Q1 10127 non-null float64 19 Avg_Utilization_Ratio 10127 non-null float64 dtypes: category(10), float64(5), int64(5) memory usage: 892.5 KB
df_credit.describe(include=['category']).T
| count | unique | top | freq | |
|---|---|---|---|---|
| Attrition_Flag | 10127 | 2 | Existing Customer | 8500 |
| Gender | 10127 | 2 | F | 5358 |
| Dependent_count | 10127 | 6 | 3 | 2732 |
| Education_Level | 10127 | 7 | Graduate | 3128 |
| Marital_Status | 10127 | 4 | Married | 4687 |
| Income_Category | 10127 | 6 | Less than $40K | 3561 |
| Card_Category | 10127 | 4 | Blue | 9436 |
| Total_Relationship_Count | 10127 | 6 | 3 | 2305 |
| Months_Inactive_12_mon | 10127 | 7 | 3 | 3846 |
| Contacts_Count_12_mon | 10127 | 7 | 3 | 3380 |
Age
df_credit.Customer_Age.describe()
count 10127.00000 mean 46.32596 std 8.01681 min 26.00000 25% 41.00000 50% 46.00000 75% 52.00000 max 73.00000 Name: Customer_Age, dtype: float64
df_credit['Agebin'] = pd.cut(df_credit['Customer_Age'], bins = [25, 35,45,55,65, 75], labels = ['25-35', '36-45', '46-55', '56-65','66-75'])
df_credit.Agebin.value_counts()
46-55 4135 36-45 3742 56-65 1321 25-35 919 66-75 10 Name: Agebin, dtype: int64
def dist_box(data):
# function plots a combined graph for univariate analysis of continous variable
#to check spread, central tendency , dispersion and outliers
Name=data.name.upper()
fig,(ax_box,ax_dis) =plt.subplots(nrows=2,sharex=True,gridspec_kw = {"height_ratios": (.25, .75)},figsize=(8, 5))
mean=data.mean()
median=data.median()
mode=data.mode().tolist()[0]
sns.set_theme(style="white")
fig.suptitle("SPREAD OF DATA FOR "+ Name , fontsize=18, fontweight='bold')
sns.boxplot(x=data,showmeans=True, orient='h',color="tan",ax=ax_box)
ax_box.set(xlabel='')
# just trying to make visualisation better. This will set background to white
sns.despine(top=True,right=True,left=True) # to remove side line from graph
sns.distplot(data,kde=False,color='red',ax=ax_dis)
ax_dis.axvline(mean, color='r', linestyle='--',linewidth=2)
ax_dis.axvline(median, color='g', linestyle='-',linewidth=2)
plt.legend({'Mean':mean,'Median':median})
#select all quantitative columns for checking the spread
list_col= df_credit.select_dtypes(include='number').columns.to_list()
for i in range(len(list_col)):
dist_box(df_credit[list_col[i]])
Observations
# Making a list of all categorical variables
plt.figure(figsize=(15,20))
sns.set_theme(style="white")
for i, variable in enumerate(cat_cols):
plt.subplot(9,2,i+1)
order = df_credit[variable].value_counts(ascending=False).index
#sns.set_palette(list_palette[i]) # to set the palette
sns.set_palette('twilight_shifted')
ax=sns.countplot(x=df_credit[variable], data=df_credit )
sns.despine(top=True,right=True,left=True) # to remove side line from graph
for p in ax.patches:
percentage = '{:.1f}%'.format(100 * p.get_height()/len(df_credit[variable]))
x = p.get_x() + p.get_width() / 2 - 0.05
y = p.get_y() + p.get_height()
plt.annotate(percentage, (x, y),ha='center')
plt.tight_layout()
plt.title(cat_cols[i].upper())
Observations
sns.set_palette(sns.color_palette("Set2", 8))
plt.figure(figsize=(15,12))
sns.heatmap(df_credit.corr(),annot=True)
plt.show()
sns.set_palette(sns.color_palette("Set1", 8))
sns.pairplot(df_credit, hue="Attrition_Flag",corner=True)
plt.show()
Observations
### Function to plot distributions and Boxplots of customers
def plot(x,target='Attrition_Flag'):
fig,axs = plt.subplots(2,2,figsize=(12,10))
axs[0, 0].set_title(f'Distribution of {x} \n of a existing customer',fontsize=12,fontweight='bold')
sns.distplot(df_credit[(df_credit[target] == 'Existing Customer')][x],ax=axs[0,0],color='teal')
axs[0, 1].set_title(f"Distribution of {x}\n of a attrited customer ",fontsize=12,fontweight='bold')
sns.distplot(df_credit[(df_credit[target] == 'Attrited Customer')][x],ax=axs[0,1],color='orange')
axs[1,0].set_title(f'Boxplot of {x} w.r.t attrited customer',fontsize=12,fontweight='bold')
line = plt.Line2D((.1,.9),(.5,.5), color='grey', linewidth=1.5,linestyle='--')
fig.add_artist(line)
sns.boxplot(df_credit[target],df_credit[x],ax=axs[1,0],palette='gist_rainbow',showmeans=True)
axs[1,1].set_title(f'Boxplot of {x} w.r.t Attrited customer - Without outliers',fontsize=12,fontweight='bold')
sns.boxplot(df_credit[target],df_credit[x],ax=axs[1,1],showfliers=False,palette='gist_rainbow',showmeans=True) #turning off outliers from boxplot
sns.despine(top=True,right=True,left=True) # to remove side line from graph
plt.tight_layout(pad=4)
plt.show()
# select all quantitative columns for checking the spread
#list_col= ['Age','DurationOfPitch','MonthlyIncome']
list_col=df_credit.select_dtypes(include='number').columns.to_list()
#print(list_col)
#plt.figure(figsize=(14,23))
for j in range(len(list_col)):
plot(list_col[j])
Observation
plt.figure(figsize=(10,5))
sns.set_palette(sns.color_palette("tab20", 8))
sns.barplot(y='Credit_Limit',x='Income_Category',hue='Attrition_Flag',data=df_credit)
sns.despine(top=True,right=True,left=True) # to remove side line from graph
plt.legend(bbox_to_anchor=(1.00, 1))
plt.title('Income vs credit')
Text(0.5, 1.0, 'Income vs credit')
# Making a list of all categorical variables
plt.figure(figsize=(15,20))
sns.set_theme(style="white")
for i, variable in enumerate(cat_cols):
plt.subplot(9,2,i+1)
order = df_credit[variable].value_counts(ascending=False).index
#sns.set_palette(list_palette[i]) # to set the palette
sns.set_palette('twilight_shifted')
ax=sns.countplot(x=df_credit[variable], data=df_credit )
sns.despine(top=True,right=True,left=True) # to remove side line from graph
for p in ax.patches:
percentage = '{:.1f}%'.format(100 * p.get_height()/len(df_credit[variable]))
x = p.get_x() + p.get_width() / 2 - 0.05
y = p.get_y() + p.get_height()
plt.annotate(percentage, (x, y),ha='center')
plt.tight_layout()
plt.title(cat_cols[i].upper())
Observations
sns.set_palette(sns.color_palette("Set2", 8))
plt.figure(figsize=(15,12))
sns.heatmap(df_credit.corr(),annot=True)
plt.show()
sns.set_palette(sns.color_palette("Set1", 8))
sns.pairplot(df_credit, hue="Attrition_Flag",corner=True)
plt.show()
plt.figure(figsize=(10,5))
sns.set_palette(sns.color_palette("tab20", 9))
sns.barplot(y='Credit_Limit',x='Education_Level',hue='Attrition_Flag',data=df_credit)
sns.despine(top=True,right=True,left=True) # to remove side line from graph
plt.legend(bbox_to_anchor=(1.00, 1))
plt.title('CustomerAge vs Education')
Text(0.5, 1.0, 'CustomerAge vs Education')
plt.figure(figsize=(10,5))
sns.set_palette(sns.color_palette("tab20", 9))
sns.barplot(x='Agebin',y='Credit_Limit',hue='Attrition_Flag',data=df_credit)
sns.despine(top=True,right=True,left=True) # to remove side line from graph
plt.legend(bbox_to_anchor=(1.00, 1))
plt.title('CustomerAge vs Credit limit')
Text(0.5, 1.0, 'CustomerAge vs Credit limit')
plt.figure(figsize=(10,5))
sns.set_palette(sns.color_palette("tab20", 9))
sns.barplot(x='Agebin',y='Total_Revolving_Bal',hue='Attrition_Flag',data=df_credit)
sns.despine(top=True,right=True,left=True) # to remove side line from graph
plt.legend(bbox_to_anchor=(1.00, 1))
plt.title('CustomerAge vs Total Revolving Balance')
Text(0.5, 1.0, 'CustomerAge vs Total Revolving Balance')
plt.figure(figsize=(10,5))
sns.set_palette(sns.color_palette("tab20", 9))
sns.barplot(x='Agebin',y='Total_Trans_Amt',hue='Attrition_Flag',data=df_credit)
sns.despine(top=True,right=True,left=True) # to remove side line from graph
plt.legend(bbox_to_anchor=(1.00, 1))
plt.title('CustomerAge vs Total Transcational Amount')
Text(0.5, 1.0, 'CustomerAge vs Total Transcational Amount')
plt.figure(figsize=(10,5))
sns.barplot(y='Credit_Limit',x='Gender',hue='Attrition_Flag',data=df_credit)
sns.despine(top=True,right=True,left=True) # to remove side line from graph
plt.legend(bbox_to_anchor=(1.00, 1))
plt.title('Credit limit vs Gender')
Text(0.5, 1.0, 'Credit limit vs Gender')
plt.figure(figsize=(10,5))
sns.barplot(y='Credit_Limit',x='Card_Category',hue='Attrition_Flag',data=df_credit)
sns.despine(top=True,right=True,left=True) # to remove side line from graph
plt.legend(bbox_to_anchor=(1.00, 1))
plt.title('Credit Limit vs Card Category')
Text(0.5, 1.0, 'Credit Limit vs Card Category')
plt.figure(figsize=(10,5))
sns.barplot(y='Total_Trans_Amt',x='Card_Category',hue='Attrition_Flag',data=df_credit)
sns.despine(top=True,right=True,left=True) # to remove side line from graph
plt.legend(bbox_to_anchor=(1.00, 1))
plt.title('Total Transcation Amount vs Card')
Text(0.5, 1.0, 'Total Transcation Amount vs Card')
plt.figure(figsize=(10,5))
sns.barplot(y='Total_Trans_Ct',x='Card_Category',hue='Attrition_Flag',data=df_credit)
sns.despine(top=True,right=True,left=True) # to remove side line from graph
plt.legend(bbox_to_anchor=(1.00, 1))
plt.title('Total Transcation Count vs Card Category')
Text(0.5, 1.0, 'Total Transcation Count vs Card Category')
Observation
## Function to plot stacked bar chart
def stacked_plot(x):
sns.set_palette(sns.color_palette("tab20", 8))
tab1 = pd.crosstab(x,df_credit['Attrition_Flag'],margins=True)
display(tab1)
tab = pd.crosstab(x,df_credit['Attrition_Flag'],normalize='index')
tab.plot(kind='bar',stacked=True,figsize=(9,5))
plt.xticks(rotation=360)
#labels=["No","Yes"]
plt.legend(loc='lower left', frameon=False,)
plt.legend(loc="upper left",title=" ",bbox_to_anchor=(1,1))
sns.despine(top=True,right=True,left=True) # to remove side line from graph
#plt.legend(labels)
plt.show()
cat_cols.append("Agebin")
for i, variable in enumerate(cat_cols):
stacked_plot(df_credit[variable])
| Attrition_Flag | Attrited Customer | Existing Customer | All |
|---|---|---|---|
| Attrition_Flag | |||
| Attrited Customer | 1627 | 0 | 1627 |
| Existing Customer | 0 | 8500 | 8500 |
| All | 1627 | 8500 | 10127 |
| Attrition_Flag | Attrited Customer | Existing Customer | All |
|---|---|---|---|
| Gender | |||
| F | 930 | 4428 | 5358 |
| M | 697 | 4072 | 4769 |
| All | 1627 | 8500 | 10127 |
| Attrition_Flag | Attrited Customer | Existing Customer | All |
|---|---|---|---|
| Education_Level | |||
| College | 154 | 859 | 1013 |
| Doctorate | 95 | 356 | 451 |
| Graduate | 487 | 2641 | 3128 |
| High School | 306 | 1707 | 2013 |
| Post-Graduate | 92 | 424 | 516 |
| Uneducated | 237 | 1250 | 1487 |
| Unknown | 256 | 1263 | 1519 |
| All | 1627 | 8500 | 10127 |
| Attrition_Flag | Attrited Customer | Existing Customer | All |
|---|---|---|---|
| Marital_Status | |||
| Divorced | 121 | 627 | 748 |
| Married | 709 | 3978 | 4687 |
| Single | 668 | 3275 | 3943 |
| Unknown | 129 | 620 | 749 |
| All | 1627 | 8500 | 10127 |
| Attrition_Flag | Attrited Customer | Existing Customer | All |
|---|---|---|---|
| Income_Category | |||
| $120K + | 126 | 601 | 727 |
| $40K - $60K | 271 | 1519 | 1790 |
| $60K - $80K | 189 | 1213 | 1402 |
| $80K - $120K | 242 | 1293 | 1535 |
| Less than $40K | 612 | 2949 | 3561 |
| abc | 187 | 925 | 1112 |
| All | 1627 | 8500 | 10127 |
| Attrition_Flag | Attrited Customer | Existing Customer | All |
|---|---|---|---|
| Card_Category | |||
| Blue | 1519 | 7917 | 9436 |
| Gold | 21 | 95 | 116 |
| Platinum | 5 | 15 | 20 |
| Silver | 82 | 473 | 555 |
| All | 1627 | 8500 | 10127 |
| Attrition_Flag | Attrited Customer | Existing Customer | All |
|---|---|---|---|
| Dependent_count | |||
| 0 | 135 | 769 | 904 |
| 1 | 269 | 1569 | 1838 |
| 2 | 417 | 2238 | 2655 |
| 3 | 482 | 2250 | 2732 |
| 4 | 260 | 1314 | 1574 |
| 5 | 64 | 360 | 424 |
| All | 1627 | 8500 | 10127 |
| Attrition_Flag | Attrited Customer | Existing Customer | All |
|---|---|---|---|
| Total_Relationship_Count | |||
| 1 | 233 | 677 | 910 |
| 2 | 346 | 897 | 1243 |
| 3 | 400 | 1905 | 2305 |
| 4 | 225 | 1687 | 1912 |
| 5 | 227 | 1664 | 1891 |
| 6 | 196 | 1670 | 1866 |
| All | 1627 | 8500 | 10127 |
| Attrition_Flag | Attrited Customer | Existing Customer | All |
|---|---|---|---|
| Months_Inactive_12_mon | |||
| 0 | 15 | 14 | 29 |
| 1 | 100 | 2133 | 2233 |
| 2 | 505 | 2777 | 3282 |
| 3 | 826 | 3020 | 3846 |
| 4 | 130 | 305 | 435 |
| 5 | 32 | 146 | 178 |
| 6 | 19 | 105 | 124 |
| All | 1627 | 8500 | 10127 |
| Attrition_Flag | Attrited Customer | Existing Customer | All |
|---|---|---|---|
| Contacts_Count_12_mon | |||
| 0 | 7 | 392 | 399 |
| 1 | 108 | 1391 | 1499 |
| 2 | 403 | 2824 | 3227 |
| 3 | 681 | 2699 | 3380 |
| 4 | 315 | 1077 | 1392 |
| 5 | 59 | 117 | 176 |
| 6 | 54 | 0 | 54 |
| All | 1627 | 8500 | 10127 |
| Attrition_Flag | Attrited Customer | Existing Customer | All |
|---|---|---|---|
| Agebin | |||
| 25-35 | 122 | 797 | 919 |
| 36-45 | 606 | 3136 | 3742 |
| 46-55 | 688 | 3447 | 4135 |
| 56-65 | 209 | 1112 | 1321 |
| 66-75 | 2 | 8 | 10 |
| All | 1627 | 8500 | 10127 |
Observations
#Profile of Attrited Customer with Blue Card
df_credit[(df_credit['Card_Category']=='Blue') & (df_credit['Attrition_Flag']=='Attrited Customer')].describe(include='all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Attrition_Flag | 1519 | 1 | Attrited Customer | 1519 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Customer_Age | 1519.00000 | NaN | NaN | NaN | 46.66228 | 7.69818 | 26.00000 | 41.00000 | 47.00000 | 52.00000 | 68.00000 |
| Gender | 1519 | 2 | F | 890 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Dependent_count | 1519.00000 | 6.00000 | 3.00000 | 456.00000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Education_Level | 1519 | 7 | Graduate | 450 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Marital_Status | 1519 | 4 | Married | 674 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Income_Category | 1519 | 6 | Less than $40K | 586 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Card_Category | 1519 | 1 | Blue | 1519 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Months_on_book | 1519.00000 | NaN | NaN | NaN | 36.20013 | 7.82959 | 13.00000 | 32.00000 | 36.00000 | 40.00000 | 56.00000 |
| Total_Relationship_Count | 1519.00000 | 6.00000 | 3.00000 | 386.00000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Months_Inactive_12_mon | 1519.00000 | 7.00000 | 3.00000 | 769.00000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Contacts_Count_12_mon | 1519.00000 | 7.00000 | 3.00000 | 634.00000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Credit_Limit | 1519.00000 | NaN | NaN | NaN | 6817.74733 | 7470.18771 | 1438.30000 | 2004.50000 | 3841.00000 | 8313.50000 | 34516.00000 |
| Total_Revolving_Bal | 1519.00000 | NaN | NaN | NaN | 669.03555 | 922.33325 | 0.00000 | 0.00000 | 0.00000 | 1303.50000 | 2517.00000 |
| Avg_Open_To_Buy | 1519.00000 | NaN | NaN | NaN | 6148.71178 | 7500.48659 | 3.00000 | 1520.50000 | 3157.00000 | 7571.50000 | 34516.00000 |
| Total_Amt_Chng_Q4_Q1 | 1519.00000 | NaN | NaN | NaN | 0.69230 | 0.21256 | 0.00000 | 0.54400 | 0.69800 | 0.85350 | 1.41100 |
| Total_Trans_Amt | 1519.00000 | NaN | NaN | NaN | 2954.16458 | 2180.36497 | 510.00000 | 1896.00000 | 2314.00000 | 2709.00000 | 10583.00000 |
| Total_Trans_Ct | 1519.00000 | NaN | NaN | NaN | 44.22646 | 14.13832 | 10.00000 | 37.00000 | 43.00000 | 50.50000 | 91.00000 |
| Total_Ct_Chng_Q4_Q1 | 1519.00000 | NaN | NaN | NaN | 0.55036 | 0.22574 | 0.00000 | 0.40000 | 0.52400 | 0.68700 | 2.50000 |
| Avg_Utilization_Ratio | 1519.00000 | NaN | NaN | NaN | 0.17174 | 0.27103 | 0.00000 | 0.00000 | 0.00000 | 0.26150 | 0.99900 |
| Agebin | 1519 | 5 | 46-55 | 634 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
#Profile of Attrited Customer with gold Card
df_credit[(df_credit['Card_Category']=='Gold') & (df_credit['Attrition_Flag']=='Attrited Customer')].describe(include='all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Attrition_Flag | 21 | 1 | Attrited Customer | 21 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Customer_Age | 21.00000 | NaN | NaN | NaN | 43.85714 | 6.22323 | 32.00000 | 41.00000 | 44.00000 | 47.00000 | 59.00000 |
| Gender | 21 | 2 | M | 13 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Dependent_count | 21.00000 | 6.00000 | 2.00000 | 7.00000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Education_Level | 21 | 7 | Graduate | 6 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Marital_Status | 21 | 3 | Single | 11 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Income_Category | 21 | 6 | $60K - $80K | 6 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Card_Category | 21 | 1 | Gold | 21 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Months_on_book | 21.00000 | NaN | NaN | NaN | 33.90476 | 6.15552 | 20.00000 | 32.00000 | 36.00000 | 36.00000 | 48.00000 |
| Total_Relationship_Count | 21.00000 | 6.00000 | 2.00000 | 8.00000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Months_Inactive_12_mon | 21.00000 | 3.00000 | 3.00000 | 14.00000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Contacts_Count_12_mon | 21.00000 | 6.00000 | 3.00000 | 8.00000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Credit_Limit | 21.00000 | NaN | NaN | NaN | 29878.52381 | 7832.30742 | 15109.00000 | 23981.00000 | 34516.00000 | 34516.00000 | 34516.00000 |
| Total_Revolving_Bal | 21.00000 | NaN | NaN | NaN | 1027.00000 | 1009.46223 | 0.00000 | 0.00000 | 897.00000 | 1847.00000 | 2517.00000 |
| Avg_Open_To_Buy | 21.00000 | NaN | NaN | NaN | 28851.52381 | 7757.52462 | 13640.00000 | 23981.00000 | 32315.00000 | 34516.00000 | 34516.00000 |
| Total_Amt_Chng_Q4_Q1 | 21.00000 | NaN | NaN | NaN | 0.76990 | 0.25588 | 0.19600 | 0.59100 | 0.85800 | 0.97800 | 1.04700 |
| Total_Trans_Amt | 21.00000 | NaN | NaN | NaN | 5841.80952 | 2836.81313 | 1727.00000 | 2315.00000 | 6782.00000 | 8356.00000 | 9338.00000 |
| Total_Trans_Ct | 21.00000 | NaN | NaN | NaN | 59.85714 | 14.40238 | 34.00000 | 47.00000 | 64.00000 | 68.00000 | 89.00000 |
| Total_Ct_Chng_Q4_Q1 | 21.00000 | NaN | NaN | NaN | 0.61905 | 0.20050 | 0.21400 | 0.47800 | 0.65000 | 0.75000 | 0.97800 |
| Avg_Utilization_Ratio | 21.00000 | NaN | NaN | NaN | 0.03629 | 0.03545 | 0.00000 | 0.00000 | 0.03200 | 0.06400 | 0.11900 |
| Agebin | 21 | 4 | 36-45 | 9 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
#Profile of Attrited Customer with silver Card
df_credit[(df_credit['Card_Category']=='Silver') & (df_credit['Attrition_Flag']=='Attrited Customer')].describe(include='all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Attrition_Flag | 82 | 1 | Attrited Customer | 82 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Customer_Age | 82.00000 | NaN | NaN | NaN | 47.15854 | 7.47088 | 30.00000 | 42.25000 | 48.00000 | 52.00000 | 65.00000 |
| Gender | 82 | 2 | M | 54 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Dependent_count | 82.00000 | 6.00000 | 2.00000 | 23.00000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Education_Level | 82 | 7 | Graduate | 28 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Marital_Status | 82 | 4 | Single | 43 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Income_Category | 82 | 6 | $80K - $120K | 22 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Card_Category | 82 | 1 | Silver | 82 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Months_on_book | 82.00000 | NaN | NaN | NaN | 36.36585 | 7.72124 | 18.00000 | 33.00000 | 36.00000 | 42.75000 | 56.00000 |
| Total_Relationship_Count | 82.00000 | 6.00000 | 2.00000 | 23.00000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Months_Inactive_12_mon | 82.00000 | 6.00000 | 3.00000 | 40.00000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Contacts_Count_12_mon | 82.00000 | 6.00000 | 3.00000 | 37.00000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Credit_Limit | 82.00000 | NaN | NaN | NaN | 25960.26829 | 10054.77810 | 3735.00000 | 14890.00000 | 33092.00000 | 34516.00000 | 34516.00000 |
| Total_Revolving_Bal | 82.00000 | NaN | NaN | NaN | 677.24390 | 894.66458 | 0.00000 | 0.00000 | 184.00000 | 1267.75000 | 2517.00000 |
| Avg_Open_To_Buy | 82.00000 | NaN | NaN | NaN | 25283.02439 | 10072.89178 | 3735.00000 | 14023.75000 | 31999.00000 | 34116.75000 | 34516.00000 |
| Total_Amt_Chng_Q4_Q1 | 82.00000 | NaN | NaN | NaN | 0.70591 | 0.24399 | 0.00000 | 0.55225 | 0.73350 | 0.87675 | 1.49200 |
| Total_Trans_Amt | 82.00000 | NaN | NaN | NaN | 4899.68293 | 3140.82380 | 691.00000 | 1929.50000 | 4753.00000 | 8144.00000 | 10294.00000 |
| Total_Trans_Ct | 82.00000 | NaN | NaN | NaN | 53.28049 | 17.49728 | 14.00000 | 41.00000 | 50.50000 | 68.75000 | 94.00000 |
| Total_Ct_Chng_Q4_Q1 | 82.00000 | NaN | NaN | NaN | 0.61301 | 0.24940 | 0.00000 | 0.43475 | 0.59100 | 0.78325 | 1.21100 |
| Avg_Utilization_Ratio | 82.00000 | NaN | NaN | NaN | 0.03244 | 0.05179 | 0.00000 | 0.00000 | 0.00500 | 0.05350 | 0.23400 |
| Agebin | 82 | 4 | 46-55 | 42 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
#Profile of Attrited Customer with platinum Card
df_credit[(df_credit['Card_Category']=='Platinum') & (df_credit['Attrition_Flag']=='Attrited Customer')].describe(include='all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Attrition_Flag | 5 | 1 | Attrited Customer | 5 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Customer_Age | 5.00000 | NaN | NaN | NaN | 49.40000 | 4.15933 | 43.00000 | 48.00000 | 51.00000 | 51.00000 | 54.00000 |
| Gender | 5 | 2 | F | 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Dependent_count | 5.00000 | 4.00000 | 2.00000 | 2.00000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Education_Level | 5 | 3 | Graduate | 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Marital_Status | 5 | 2 | Single | 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Income_Category | 5 | 3 | Less than $40K | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Card_Category | 5 | 1 | Platinum | 5 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Months_on_book | 5.00000 | NaN | NaN | NaN | 36.00000 | 4.52769 | 31.00000 | 32.00000 | 37.00000 | 38.00000 | 42.00000 |
| Total_Relationship_Count | 5.00000 | 3.00000 | 2.00000 | 3.00000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Months_Inactive_12_mon | 5.00000 | 2.00000 | 3.00000 | 3.00000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Contacts_Count_12_mon | 5.00000 | 3.00000 | 3.00000 | 2.00000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Credit_Limit | 5.00000 | NaN | NaN | NaN | 24997.40000 | 9281.90682 | 15987.00000 | 15987.00000 | 23981.00000 | 34516.00000 | 34516.00000 |
| Total_Revolving_Bal | 5.00000 | NaN | NaN | NaN | 263.40000 | 284.58795 | 0.00000 | 0.00000 | 193.00000 | 531.00000 | 593.00000 |
| Avg_Open_To_Buy | 5.00000 | NaN | NaN | NaN | 24734.00000 | 9212.95080 | 15794.00000 | 15987.00000 | 23388.00000 | 33985.00000 | 34516.00000 |
| Total_Amt_Chng_Q4_Q1 | 5.00000 | NaN | NaN | NaN | 0.78480 | 0.22978 | 0.43500 | 0.69500 | 0.82700 | 0.98000 | 0.98700 |
| Total_Trans_Amt | 5.00000 | NaN | NaN | NaN | 4755.80000 | 2073.95292 | 2021.00000 | 3901.00000 | 4758.00000 | 5418.00000 | 7681.00000 |
| Total_Trans_Ct | 5.00000 | NaN | NaN | NaN | 60.20000 | 10.03494 | 46.00000 | 54.00000 | 65.00000 | 65.00000 | 71.00000 |
| Total_Ct_Chng_Q4_Q1 | 5.00000 | NaN | NaN | NaN | 0.54560 | 0.14819 | 0.39400 | 0.42100 | 0.51200 | 0.69000 | 0.71100 |
| Avg_Utilization_Ratio | 5.00000 | NaN | NaN | NaN | 0.01040 | 0.01064 | 0.00000 | 0.00000 | 0.01200 | 0.01500 | 0.02500 |
| Agebin | 5 | 2 | 46-55 | 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Profile of customer who attrited most based on there card type
Most likely Female who were married , age group 46-55 and earning less than 40 k, Education level graduate and dependent member 3 , total bank product 3 and were inactive for 3 months. There average utilzation ratio was very low.
Gold Card
Most likely Male who are single , between age group 36-45 earning 60- 80k, education level graduate and inactive for 3 months.
Silver Card
Most likely Male who are single , between age group 46-55 , earned between 80 k -120 k ,education level graduate and inactive for 3 months.
Platinum card
Q1 = df_credit.quantile(0.25) #To find the 25th percentile and 75th percentile.
Q3 = df_credit.quantile(0.75)
IQR = Q3 - Q1 #Inter Quantile Range (75th perentile - 25th percentile)
lower=Q1-1.5*IQR #Finding lower and upper bounds for all values. All values outside these bounds are outliers
upper=Q3+1.5*IQR
((df_credit.select_dtypes(include=['float64','int64'])<lower) | (df_credit.select_dtypes(include=['float64','int64'])>upper)).sum()/len(df_credit)*100
Customer_Age 0.01975 Months_on_book 3.81159 Credit_Limit 9.71660 Total_Revolving_Bal 0.00000 Avg_Open_To_Buy 9.50923 Total_Amt_Chng_Q4_Q1 3.91034 Total_Trans_Amt 8.84764 Total_Trans_Ct 0.01975 Total_Ct_Chng_Q4_Q1 3.89059 Avg_Utilization_Ratio 0.00000 dtype: float64
numeric_columns = df_credit.select_dtypes('number').columns.to_list()
# outlier detection using boxplot
plt.figure(figsize=(20,30))
for i, variable in enumerate(numeric_columns):
plt.subplot(4,4,i+1)
plt.boxplot(df_credit[variable],whis=1.5)
plt.tight_layout()
plt.title(variable)
plt.show()
print(upper)
Customer_Age 68.50000 Months_on_book 53.50000 Credit_Limit 23836.25000 Total_Revolving_Bal 3921.50000 Avg_Open_To_Buy 22660.75000 Total_Amt_Chng_Q4_Q1 1.20100 Total_Trans_Amt 8619.25000 Total_Trans_Ct 135.00000 Total_Ct_Chng_Q4_Q1 1.17200 Avg_Utilization_Ratio 1.22300 dtype: float64
df_credit[df_credit['Credit_Limit'] > upper.Credit_Limit].sort_values(by='Credit_Limit',ascending=False ).count()
Attrition_Flag 984 Customer_Age 984 Gender 984 Dependent_count 984 Education_Level 984 Marital_Status 984 Income_Category 984 Card_Category 984 Months_on_book 984 Total_Relationship_Count 984 Months_Inactive_12_mon 984 Contacts_Count_12_mon 984 Credit_Limit 984 Total_Revolving_Bal 984 Avg_Open_To_Buy 984 Total_Amt_Chng_Q4_Q1 984 Total_Trans_Amt 984 Total_Trans_Ct 984 Total_Ct_Chng_Q4_Q1 984 Avg_Utilization_Ratio 984 Agebin 984 dtype: int64
df_credit[df_credit['Credit_Limit']== 34516.00000].count() # had seen this number during EDA so verifying
Attrition_Flag 508 Customer_Age 508 Gender 508 Dependent_count 508 Education_Level 508 Marital_Status 508 Income_Category 508 Card_Category 508 Months_on_book 508 Total_Relationship_Count 508 Months_Inactive_12_mon 508 Contacts_Count_12_mon 508 Credit_Limit 508 Total_Revolving_Bal 508 Avg_Open_To_Buy 508 Total_Amt_Chng_Q4_Q1 508 Total_Trans_Amt 508 Total_Trans_Ct 508 Total_Ct_Chng_Q4_Q1 508 Avg_Utilization_Ratio 508 Agebin 508 dtype: int64
df_credit[df_credit['Total_Trans_Amt'] > upper.Total_Trans_Amt].sort_values(by='Total_Trans_Amt',ascending=False ).head(10)
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Agebin | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9964 | Existing Customer | 47 | M | 4 | Unknown | Married | $60K - $80K | Blue | 36 | 4 | 5 | 2 | 10585.00000 | 1749 | 8836.00000 | 0.65500 | 18484 | 108 | 0.58800 | 0.16500 | 46-55 |
| 10073 | Existing Customer | 51 | M | 2 | Graduate | Married | $60K - $80K | Blue | 40 | 3 | 3 | 3 | 3750.00000 | 1801 | 1949.00000 | 0.88900 | 17995 | 116 | 0.65700 | 0.48000 | 46-55 |
| 10097 | Existing Customer | 31 | M | 0 | High School | Single | $40K - $60K | Blue | 25 | 3 | 2 | 3 | 4493.00000 | 1388 | 3105.00000 | 0.79500 | 17744 | 104 | 0.76300 | 0.30900 | 25-35 |
| 9601 | Existing Customer | 45 | M | 4 | High School | Single | $60K - $80K | Blue | 35 | 1 | 1 | 3 | 8449.00000 | 2092 | 6357.00000 | 0.70900 | 17634 | 120 | 0.66700 | 0.24800 | 36-45 |
| 9341 | Existing Customer | 48 | M | 2 | High School | Married | Less than $40K | Silver | 36 | 2 | 2 | 2 | 14581.00000 | 2517 | 12064.00000 | 0.77600 | 17628 | 109 | 0.81700 | 0.17300 | 46-55 |
| 10117 | Existing Customer | 57 | M | 2 | Graduate | Married | $80K - $120K | Blue | 40 | 6 | 3 | 4 | 17925.00000 | 1909 | 16016.00000 | 0.71200 | 17498 | 111 | 0.82000 | 0.10600 | 56-65 |
| 10028 | Existing Customer | 36 | F | 1 | Graduate | Single | Less than $40K | Blue | 16 | 6 | 5 | 3 | 6091.00000 | 1184 | 4907.00000 | 0.76600 | 17437 | 113 | 0.76600 | 0.19400 | 36-45 |
| 9643 | Existing Customer | 54 | F | 2 | Graduate | Married | $40K - $60K | Blue | 41 | 3 | 1 | 3 | 7362.00000 | 1176 | 6186.00000 | 0.73500 | 17390 | 130 | 0.68800 | 0.16000 | 46-55 |
| 9712 | Existing Customer | 49 | M | 4 | Post-Graduate | Single | $80K - $120K | Blue | 42 | 3 | 2 | 1 | 30885.00000 | 2018 | 28867.00000 | 0.90400 | 17350 | 115 | 0.62000 | 0.06500 | 46-55 |
| 9645 | Existing Customer | 35 | M | 3 | Post-Graduate | Married | $80K - $120K | Blue | 28 | 3 | 1 | 2 | 4380.00000 | 0 | 4380.00000 | 0.71900 | 17258 | 121 | 0.70400 | 0.00000 | 25-35 |
df_credit[df_credit['Avg_Open_To_Buy'] > upper.Avg_Open_To_Buy].sort_values(by='Avg_Open_To_Buy',ascending=False ).head(10)
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Agebin | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10112 | Attrited Customer | 33 | M | 2 | College | Married | $120K + | Gold | 20 | 2 | 1 | 4 | 34516.00000 | 0 | 34516.00000 | 1.00400 | 9338 | 73 | 0.62200 | 0.00000 | 25-35 |
| 9047 | Attrited Customer | 50 | M | 1 | Post-Graduate | Unknown | $80K - $120K | Gold | 36 | 2 | 3 | 2 | 34516.00000 | 0 | 34516.00000 | 1.03200 | 5547 | 75 | 0.74400 | 0.00000 | 46-55 |
| 2196 | Existing Customer | 50 | M | 3 | High School | Married | $120K + | Blue | 40 | 5 | 1 | 4 | 34516.00000 | 0 | 34516.00000 | 0.98600 | 1930 | 36 | 0.44000 | 0.00000 | 46-55 |
| 2201 | Attrited Customer | 55 | F | 2 | College | Single | abc | Silver | 36 | 2 | 3 | 3 | 34516.00000 | 0 | 34516.00000 | 0.39900 | 1353 | 40 | 0.21200 | 0.00000 | 46-55 |
| 9127 | Existing Customer | 56 | F | 3 | Uneducated | Single | abc | Platinum | 46 | 2 | 3 | 2 | 34516.00000 | 0 | 34516.00000 | 0.88700 | 8416 | 93 | 0.63200 | 0.00000 | 56-65 |
| 2341 | Existing Customer | 52 | M | 1 | Unknown | Single | $120K + | Blue | 44 | 6 | 1 | 2 | 34516.00000 | 0 | 34516.00000 | 1.03000 | 2848 | 56 | 0.75000 | 0.00000 | 46-55 |
| 9075 | Existing Customer | 51 | M | 3 | Uneducated | Married | $80K - $120K | Silver | 37 | 1 | 2 | 1 | 34516.00000 | 0 | 34516.00000 | 0.81400 | 8736 | 97 | 0.70200 | 0.00000 | 46-55 |
| 9073 | Existing Customer | 39 | M | 2 | Unknown | Single | $80K - $120K | Gold | 33 | 1 | 3 | 2 | 34516.00000 | 0 | 34516.00000 | 0.72400 | 9179 | 113 | 0.76600 | 0.00000 | 36-45 |
| 9068 | Attrited Customer | 54 | F | 0 | Graduate | Single | abc | Platinum | 38 | 2 | 2 | 2 | 34516.00000 | 0 | 34516.00000 | 0.69500 | 3901 | 54 | 0.42100 | 0.00000 | 46-55 |
| 9027 | Attrited Customer | 44 | M | 4 | Unknown | Married | $120K + | Blue | 36 | 4 | 3 | 3 | 34516.00000 | 0 | 34516.00000 | 1.04300 | 5425 | 60 | 0.87500 | 0.00000 | 36-45 |
df_credit = df_credit.replace({'Unknown': None})
df_credit.isnull().sum()
Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 Agebin 0 dtype: int64
df_credit.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null category 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null category 3 Dependent_count 10127 non-null category 4 Education_Level 8608 non-null category 5 Marital_Status 9378 non-null category 6 Income_Category 10127 non-null category 7 Card_Category 10127 non-null category 8 Months_on_book 10127 non-null int64 9 Total_Relationship_Count 10127 non-null category 10 Months_Inactive_12_mon 10127 non-null category 11 Contacts_Count_12_mon 10127 non-null category 12 Credit_Limit 10127 non-null float64 13 Total_Revolving_Bal 10127 non-null int64 14 Avg_Open_To_Buy 10127 non-null float64 15 Total_Amt_Chng_Q4_Q1 10127 non-null float64 16 Total_Trans_Amt 10127 non-null int64 17 Total_Trans_Ct 10127 non-null int64 18 Total_Ct_Chng_Q4_Q1 10127 non-null float64 19 Avg_Utilization_Ratio 10127 non-null float64 20 Agebin 10127 non-null category dtypes: category(11), float64(5), int64(5) memory usage: 902.4 KB
# Label Encode categorical variables
attrition = {'Existing Customer':0, 'Attrited Customer':1}
df_credit['Attrition_Flag']=df_credit['Attrition_Flag'].map(attrition)
marital_status = {'Married':1,'Single':2, 'Divorced':3}
df_credit['Marital_Status']=df_credit['Marital_Status'].map(marital_status)
education = {'Uneducated':1,'High School':2, 'Graduate':3, 'College':4, 'Post-Graduate':5, 'Doctorate':6}
df_credit['Education_Level']=df_credit['Education_Level'].map(education)
income = {'Less than $40K':1,'$40K - $60K':2, '$60K - $80K':3, '$80K - $120K':4, '$120K +':5}
df_credit['Income_Category']=df_credit['Income_Category'].map(income)
imputer = KNNImputer(n_neighbors=5)
reqd_col_for_impute = ['Income_Category','Education_Level','Marital_Status']
# Separating target column
X = df_credit.drop(['Agebin','Attrition_Flag','Avg_Open_To_Buy'],axis=1)
#X = pd.get_dummies(X,drop_first=True)
y = df_credit['Attrition_Flag']
# Splitting the data into train and test sets in 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1,stratify=y)
X_train.shape, X_test.shape
((7088, 18), (3039, 18))
#Fit and transform the train data
X_train[reqd_col_for_impute]=imputer.fit_transform(X_train[reqd_col_for_impute])
#Transform the test data
X_test[reqd_col_for_impute]=imputer.transform(X_test[reqd_col_for_impute])
#Checking that no column has missing values in train or test sets
print(X_train.isnull().sum())
print('-'*30)
print(X_test.isnull().sum())
Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64 ------------------------------ Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
## Function to inverse the encoding
def inverse_mapping(x,y):
inv_dict = {v: k for k, v in x.items()}
X_train[y] = np.round(X_train[y]).map(inv_dict).astype('category')
X_test[y] = np.round(X_test[y]).map(inv_dict).astype('category')
inverse_mapping(education,'Education_Level')
inverse_mapping(marital_status,'Marital_Status')
inverse_mapping(income,'Income_Category')
X_train=pd.get_dummies(X_train,drop_first=True)
X_test=pd.get_dummies(X_test,drop_first=True)
print(X_train.shape, X_test.shape)
(7088, 46) (3039, 46)
X_train
| Customer_Age | Months_on_book | Credit_Limit | Total_Revolving_Bal | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Gender_M | Dependent_count_1 | Dependent_count_2 | ... | Months_Inactive_12_mon_1 | Months_Inactive_12_mon_2 | Months_Inactive_12_mon_3 | Months_Inactive_12_mon_4 | Months_Inactive_12_mon_5 | Months_Inactive_12_mon_6 | Contacts_Count_12_mon_1 | Contacts_Count_12_mon_2 | Contacts_Count_12_mon_3 | Contacts_Count_12_mon_4 | Contacts_Count_12_mon_5 | Contacts_Count_12_mon_6 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4124 | 50 | 43 | 7985.00000 | 0 | 1.03200 | 3873 | 72 | 0.67400 | 0.00000 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 4686 | 50 | 36 | 5444.00000 | 2499 | 0.46800 | 4509 | 80 | 0.66700 | 0.45900 | 1 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1276 | 26 | 13 | 1643.00000 | 1101 | 0.71300 | 2152 | 50 | 0.47100 | 0.67000 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 6119 | 65 | 55 | 2022.00000 | 0 | 0.57900 | 4623 | 65 | 0.54800 | 0.00000 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2253 | 46 | 35 | 4930.00000 | 0 | 1.01900 | 3343 | 77 | 0.63800 | 0.00000 | 1 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4581 | 50 | 36 | 1438.30000 | 0 | 0.65800 | 2329 | 43 | 0.59300 | 0.00000 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 9338 | 56 | 47 | 7204.00000 | 0 | 0.80300 | 14042 | 113 | 0.76600 | 0.00000 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 9950 | 45 | 36 | 34516.00000 | 0 | 0.73200 | 8603 | 84 | 0.61500 | 0.00000 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 1784 | 35 | 29 | 34516.00000 | 1965 | 1.04400 | 2949 | 70 | 1.00000 | 0.05700 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4752 | 41 | 32 | 3189.00000 | 0 | 0.91000 | 4813 | 86 | 0.75500 | 0.00000 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
7088 rows × 46 columns
# # defining empty lists to add train and test results
model_name=[]
acc_train = []
acc_test = []
recall_train = []
recall_test = []
precision_train = []
precision_test = []
f1_train = []
f1_test = []
def make_confusion_matrix(y_actual,y_predict,title):
'''Plot confusion matrix'''
fig, ax = plt.subplots(1, 1)
cm = confusion_matrix(y_actual, y_predict, labels=[0,1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
display_labels=["No","Yes"])
disp.plot(cmap='Blues',ax=ax)
ax.set_title(title)
plt.tick_params(axis=u'both', which=u'both',length=0)
plt.grid(b=None,axis='both',which='both',visible=False)
plt.show()
def get_metrics_score(model,modelname,X_train_pass,X_test_df_pass,y_train_pass,y_test_pass):
'''
Function to calculate different metric scores of the model - Accuracy, Recall, Precision, and F1 score
model: classifier to predict values of X
train, test: Independent features
train_y,test_y: Dependent variable
threshold: thresold for classifiying the observation as 1
'''
# defining an empty list to store train and test results
score_list=[]
pred_train = model.predict(X_train_pass)
pred_test = model.predict(X_test_df_pass)
pred_train = np.round(pred_train)
pred_test = np.round(pred_test)
train_acc = accuracy_score(y_train_pass,pred_train)
test_acc = accuracy_score(y_test_pass,pred_test)
train_recall = recall_score(y_train_pass,pred_train)
test_recall = recall_score(y_test_pass,pred_test)
train_precision = precision_score(y_train_pass,pred_train)
test_precision = precision_score(y_test_pass,pred_test)
train_f1 = f1_score(y_train_pass,pred_train)
test_f1 = f1_score(y_test_pass,pred_test)
score_list.extend((train_acc,test_acc,train_recall,test_recall,train_precision,test_precision,train_f1,test_f1))
model_name.append(modelname)
acc_train.append(score_list[0])
acc_test.append(score_list[1])
recall_train.append(score_list[2])
recall_test.append(score_list[3])
precision_train.append(score_list[4])
precision_test.append(score_list[5])
f1_train.append(score_list[6])
f1_test.append(score_list[7])
metric_names = ['Train_Accuracy', 'Test_Accuracy', 'Train_Recall', 'Test_Recall','Train_Precision',
'Test_Precision', 'Train_F1-Score', 'Test_F1-Score']
cols = ['Metric', 'Score']
records = [(name, score) for name, score in zip(metric_names, score_list)]
display(pd.DataFrame.from_records(records, columns=cols, index='Metric').T)
# display confusion matrix
make_confusion_matrix(y_train_pass,pred_train,"Confusion Matrix for Train")
make_confusion_matrix(y_test_pass,pred_test,"Confusion Matrix for Test")
return score_list # returning the list with train and test scores
1.Predicting a customer will churn but he does not - Loss of resources.
2.Predicting a customer will not churn the services but he does - Loss of income
Which case is more important?
How to reduce this loss i.e need to reduce False Negatives?
#Initialize model using pipeline
pipe_lr = make_pipeline( StandardScaler(), (LogisticRegression(random_state=1)))
#Fit on train data
pipe_lr.fit(X_train,y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
('logisticregression', LogisticRegression(random_state=1))])
lr_score=get_metrics_score(pipe_lr,'LogisticRegression',X_train,X_test,y_train,y_test)
| Metric | Train_Accuracy | Test_Accuracy | Train_Recall | Test_Recall | Train_Precision | Test_Precision | Train_F1-Score | Test_F1-Score |
|---|---|---|---|---|---|---|---|---|
| Score | 0.91366 | 0.91214 | 0.62687 | 0.63730 | 0.79245 | 0.77556 | 0.70000 | 0.69966 |
Let's evaluate the model performance by using KFold and cross_val_score
#Evaluate the model performance by using KFold and cross_val_score
scoring='recall'
kfold=StratifiedKFold(n_splits=5,shuffle=True,random_state=1) #Setting number of splits equal to 5
lr_cv_result=cross_val_score(estimator=pipe_lr, X=X_train, y=y_train, scoring=scoring, cv=kfold)
#Plotting boxplots for CV scores of model defined above
plt.boxplot(lr_cv_result)
plt.show()
Handling Imbalanced dataset
This is an Imbalanced dataset .A problem with imbalanced classification is that there are too few examples of the minority class for a model to effectively learn the decision boundary.
One way to solve this problem is to oversample the examples in the minority class. This can be achieved by simply duplicating examples from the minority class in the training dataset prior to fitting a model. This can balance the class distribution but does not provide any additional information to the model.One approach to addressing imbalanced datasets is to oversample the minority class. The simplest approach involves duplicating examples in the minority class, although these examples don’t add any new information to the model. Instead, new examples can be synthesized from the existing examples. This is a type of data augmentation for the minority class and is referred to as the Synthetic Minority Oversampling Technique, or SMOTE for short.
Since dataset is imbalanced let try oversampling using SMOTE and see if performance can be improved.
print(f"Before UpSampling, counts of label attrited customer: {sum(y_train==1)}")
print(f"Before UpSampling, counts of label existing customer: {sum(y_train==0)} \n")
sm = SMOTE(sampling_strategy = 1 ,k_neighbors = 5, random_state=1) #Synthetic Minority Over Sampling Technique
X_train_over, y_train_over = sm.fit_resample(X_train, y_train.ravel())
print(f"After UpSampling, counts of label attrited customer: {sum(y_train_over==1)}")
print(f"After UpSampling, counts of label existing customer: {sum(y_train_over==0)} \n")
print(f'After UpSampling, the shape of train_X: {X_train_over.shape}')
print(f'After UpSampling, the shape of train_y: {y_train_over.shape} \n')
Before UpSampling, counts of label attrited customer: 1139 Before UpSampling, counts of label existing customer: 5949 After UpSampling, counts of label attrited customer: 5949 After UpSampling, counts of label existing customer: 5949 After UpSampling, the shape of train_X: (11898, 46) After UpSampling, the shape of train_y: (11898,)
lr_over = LogisticRegression(solver='liblinear')
lr_over.fit(X_train_over, y_train_over)
LogisticRegression(solver='liblinear')
lr_score_over=get_metrics_score(lr_over,'LogisticRegression with over sampling',X_train_over,X_test,y_train_over,y_test)
| Metric | Train_Accuracy | Test_Accuracy | Train_Recall | Test_Recall | Train_Precision | Test_Precision | Train_F1-Score | Test_F1-Score |
|---|---|---|---|---|---|---|---|---|
| Score | 0.93612 | 0.90030 | 0.91528 | 0.61066 | 0.95510 | 0.72506 | 0.93476 | 0.66296 |
The recall on test data is only 0.48 ,and model is overfitting there is lot of discrepancy between test score and train score. let try regularization
What is Regularization ?
Linear regression algorithm works by selecting coefficients for each independent variable that minimizes a loss function. However, if the coefficients are large, they can lead to over-fitting on the training dataset, and such a model will not generalize well on the unseen test data.This is where regularization helps. Regularization is the process which regularizes or shrinks the coefficients towards zero. In simple words, regularization discourages learning a more complex or flexible model, to prevent overfitting.
Main Regularization Techniques
Ridge Regression (L2 Regularization)
Ridge regression adds “squared magnitude” of coefficient as penalty term to the loss function.
Lasso Regression (L1 Regularizaion)
Lasso adds "absolute values of magnitude of coefficient as penalty term to the loss function
Elastic Net Regression
Elastic net regression combines the properties of ridge and lasso regression. It works by penalizing the model using both the 1l2-norm1 and the 1l1-norm1.
Elastic Net Formula: Ridge + Lasso
# Choose the type of classifier.
pipe_lr_reg = make_pipeline( StandardScaler(), (LogisticRegression(random_state=1)))
# Grid of parameters to choose from
parameters = {'logisticregression__C': np.arange(0.007,0.5,0.01),
'logisticregression__solver' : ['liblinear','newton-cg','lbfgs','sag','saga'],
'logisticregression__penalty': ['l1','l2']
}
# Run the grid search
grid_obj = RandomizedSearchCV(pipe_lr_reg, parameters, scoring='recall',n_jobs=-1)
grid_obj = grid_obj.fit(X_train_over, y_train_over)
# Set the clf to the best combination of parameters
pipe_lr_reg = grid_obj.best_estimator_
# Fit the best algorithm to the data.
pipe_lr_reg.fit(X_train_over, y_train_over)
Pipeline(steps=[('standardscaler', StandardScaler()),
('logisticregression',
LogisticRegression(C=0.017, random_state=1))])
lr_score_under=get_metrics_score(pipe_lr_reg,'LogisticRegression with Regularization on Over sampling',X_train_over,X_test,y_train_over,y_test)
| Metric | Train_Accuracy | Test_Accuracy | Train_Recall | Test_Recall | Train_Precision | Test_Precision | Train_F1-Score | Test_F1-Score |
|---|---|---|---|---|---|---|---|---|
| Score | 0.94175 | 0.90293 | 0.91898 | 0.60656 | 0.96284 | 0.74185 | 0.94040 | 0.66742 |
Undersampling
Let see try undersampling and see if performance is different.
rus = RandomUnderSampler(random_state = 1) # Undersample dependent variable
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)
#Undersample to balance classes
print("Before Under Sampling, counts of label 'Attrited': {}".format(sum(y_train==1)))
print("Before Under Sampling, counts of label 'Existing': {} \n".format(sum(y_train==0)))
print("After Under Sampling, counts of label 'Attrited': {}".format(sum(y_train_under==1)))
print("After Under Sampling, counts of label 'Existing': {} \n".format(sum(y_train_under==0)))
print('After Under Sampling, the shape of train_X: {}'.format(X_train_under.shape))
print('After Under Sampling, the shape of train_y: {} \n'.format(y_train_under.shape))
Before Under Sampling, counts of label 'Attrited': 1139 Before Under Sampling, counts of label 'Existing': 5949 After Under Sampling, counts of label 'Attrited': 1139 After Under Sampling, counts of label 'Existing': 1139 After Under Sampling, the shape of train_X: (2278, 46) After Under Sampling, the shape of train_y: (2278,)
# Initialize model using pipeline
pipe_lr_under = make_pipeline( StandardScaler(), (LogisticRegression(random_state=1)))
# Training the basic logistic regression model with training set
pipe_lr_under.fit(X_train_under,y_train_under)
Pipeline(steps=[('standardscaler', StandardScaler()),
('logisticregression', LogisticRegression(random_state=1))])
#Evaluate the model performance by using KFold and cross_val_score
scoring='recall'
kfold=StratifiedKFold(n_splits=5,shuffle=True,random_state=1) #Setting number of splits equal to 5
cv_result_under=cross_val_score(estimator=pipe_lr_under, X=X_train_under, y=y_train_under, scoring=scoring, cv=kfold)
#Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_under)
plt.show()
lr_score_under=get_metrics_score(pipe_lr_under,'LogisticRegression with under sampling',X_train_under,X_test,y_train_under,y_test)
| Metric | Train_Accuracy | Test_Accuracy | Train_Recall | Test_Recall | Train_Precision | Test_Precision | Train_F1-Score | Test_F1-Score |
|---|---|---|---|---|---|---|---|---|
| Score | 0.85777 | 0.84962 | 0.85777 | 0.84016 | 0.85777 | 0.51965 | 0.85777 | 0.64213 |
Observation
Model after undersampling is generalized well on training and test set . Our recall after undersampling on test was better than our recall after oversampling on test.Let try regularization and see. Trying to use all the solver and different penality
# Choose the type of classifier.
pipe_lr_reg_under = make_pipeline( StandardScaler(), (LogisticRegression(random_state=1)))
# Grid of parameters to choose from
parameters = {'logisticregression__C': np.arange(0.007,0.5,0.01),
'logisticregression__solver' : ['liblinear','newton-cg','lbfgs','sag','saga'],
'logisticregression__penalty': ['l1','l2']
}
# Run the grid search
grid_obj = RandomizedSearchCV(pipe_lr_reg_under, parameters, scoring='recall',n_jobs=-1)
grid_obj = grid_obj.fit(X_train_under, y_train_under)
# Set the clf to the best combination of parameters
pipe_lr_reg_under = grid_obj.best_estimator_
# Fit the best algorithm to the data.
pipe_lr_reg_under.fit(X_train_under, y_train_under)
Pipeline(steps=[('standardscaler', StandardScaler()),
('logisticregression',
LogisticRegression(C=0.31700000000000006, random_state=1,
solver='sag'))])
lr_score_reg=get_metrics_score(pipe_lr_reg_under,'LogisticRegression with Regularization on Undersampled',X_train_under,X_test,y_train_under,y_test)
| Metric | Train_Accuracy | Test_Accuracy | Train_Recall | Test_Recall | Train_Precision | Test_Precision | Train_F1-Score | Test_F1-Score |
|---|---|---|---|---|---|---|---|---|
| Score | 0.85558 | 0.84765 | 0.85601 | 0.84016 | 0.85526 | 0.51572 | 0.85564 | 0.63913 |
comparison_frame = pd.DataFrame({'Model':model_name,
'Train_Accuracy': acc_train,'Test_Accuracy': acc_test,
'Train_Recall':recall_train,'Test_Recall':recall_test,
'Train_Precision':precision_train,'Test_Precision':precision_test,
'Train_F1':f1_train,
'Test_F1':f1_test })
#Sorting models in decreasing order of test recall
comparison_frame.sort_values(by='Test_Recall',ascending=False)
| Model | Train_Accuracy | Test_Accuracy | Train_Recall | Test_Recall | Train_Precision | Test_Precision | Train_F1 | Test_F1 | |
|---|---|---|---|---|---|---|---|---|---|
| 3 | LogisticRegression with under sampling | 0.85777 | 0.84962 | 0.85777 | 0.84016 | 0.85777 | 0.51965 | 0.85777 | 0.64213 |
| 4 | LogisticRegression with Regularization on Undersampled | 0.85558 | 0.84765 | 0.85601 | 0.84016 | 0.85526 | 0.51572 | 0.85564 | 0.63913 |
| 0 | LogisticRegression | 0.91366 | 0.91214 | 0.62687 | 0.63730 | 0.79245 | 0.77556 | 0.70000 | 0.69966 |
| 1 | LogisticRegression with over sampling | 0.93612 | 0.90030 | 0.91528 | 0.61066 | 0.95510 | 0.72506 | 0.93476 | 0.66296 |
| 2 | LogisticRegression with Regularization on Over sampling | 0.94175 | 0.90293 | 0.91898 | 0.60656 | 0.96284 | 0.74185 | 0.94040 | 0.66742 |
Logistic Regression with Under sampling is giving a generalized model and best recall with 0.857.
Here I am building different models using KFold and cross_val_score with pipelines and will tune the best model 3 models using GridSearchCV and RandomizedSearchCV
Stratified K-Folds cross-validation provides dataset indices to split data into train/validation sets. Split dataset into k consecutive folds (without shuffling by default) keeping the distribution of both classes in each fold the same as the target variable. Each fold is then used once as validation while the k - 1 remaining folds form the training set.
models = [] # Empty list to store all the models
# Appending pipelines for each model into the list
models.append(
(
"DTREE",
Pipeline(
steps=[
("scaler", StandardScaler()),
("decision_tree", DecisionTreeClassifier(random_state=1)),
]
),
)
)
models.append(
(
"RF",
Pipeline(
steps=[
("scaler", StandardScaler()),
("random_forest", RandomForestClassifier(random_state=1)),
]
),
)
)
models.append(
(
"BG",
Pipeline(
steps=[
("scaler", StandardScaler()),
("bagging", BaggingClassifier(random_state=1)),
]
),
)
)
models.append(
(
"GBM",
Pipeline(
steps=[
("scaler", StandardScaler()),
("gradient_boosting", GradientBoostingClassifier(random_state=1)),
]
),
)
)
models.append(
(
"ADB",
Pipeline(
steps=[
("scaler", StandardScaler()),
("adaboost", AdaBoostClassifier(random_state=1)),
]
),
)
)
models.append(
(
"XGB",
Pipeline(
steps=[
("scaler", StandardScaler()),
("xgboost", XGBClassifier(random_state=1,eval_metric='logloss')),
]
),
)
)
results = [] # Empty list to store all model's CV scores
names = [] # Empty list to store name of the models
# loop through all models to get the mean cross validated score
for name, model in models:
scoring = "recall"
kfold = StratifiedKFold(
n_splits=5, shuffle=True, random_state=1
) # Setting number of splits equal to 5
cv_result = cross_val_score(
estimator=model, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
results.append(cv_result)
names.append(name)
print("{}: {}".format(name, cv_result.mean() * 100))
DTREE: 77.78537754076822 RF: 72.78112682587526 BG: 79.54246850606694 GBM: 82.61341680191669 ADB: 80.77053868150553 XGB: 85.77672153953164
# Plotting boxplots for CV scores of all models defined above
fig = plt.figure(figsize=(10, 7))
fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
Random Search . Define a search space as a bounded domain of hyperparameter values and randomly sample points in that domain.
Grid Search Define a search space as a grid of hyperparameter values and evaluate every position in the grid.
%%time
# Creating pipeline
pipe_ada_grid = make_pipeline(StandardScaler(), AdaBoostClassifier(random_state=1))
# Parameter grid to pass in GridSearchCV
param_grid = {
"adaboostclassifier__n_estimators": np.arange(10, 110, 10),
"adaboostclassifier__learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
"adaboostclassifier__base_estimator": [
DecisionTreeClassifier(max_depth=1, random_state=1),
DecisionTreeClassifier(max_depth=2, random_state=1),
DecisionTreeClassifier(max_depth=3, random_state=1),
],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
pipe_ada_grid = GridSearchCV(estimator=pipe_ada_grid, param_grid=param_grid, scoring=scorer, cv=5,n_jobs = -1)
# Fitting parameters with undersampled train data in GridSeachCV
pipe_ada_grid.fit(X_train, y_train)
print("Best parameters are {} with CV score={}:" .format(pipe_ada_grid.best_params_,pipe_ada_grid.best_score_))
Best parameters are {'adaboostclassifier__base_estimator': DecisionTreeClassifier(max_depth=2, random_state=1), 'adaboostclassifier__learning_rate': 1, 'adaboostclassifier__n_estimators': 100} with CV score=0.8683205811886545:
Wall time: 2min 1s
# Creating new pipeline with best parameters
abc_tuned_grid = make_pipeline(
StandardScaler(),AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2,
random_state=1),
learning_rate=1, n_estimators=70))
# Fit the model on undersampled training data
abc_tuned_grid.fit(X_train, y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
('adaboostclassifier',
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2,
random_state=1),
learning_rate=1, n_estimators=70))])
abc_tuned_score=get_metrics_score(abc_tuned_grid,' Adaboost with Grid Search',X_train,X_test,y_train,y_test)
| Metric | Train_Accuracy | Test_Accuracy | Train_Recall | Test_Recall | Train_Precision | Test_Precision | Train_F1-Score | Test_F1-Score |
|---|---|---|---|---|---|---|---|---|
| Score | 0.98476 | 0.96644 | 0.93766 | 0.88730 | 0.96652 | 0.90208 | 0.95187 | 0.89463 |
feature_names = X_train.columns
importances = abc_tuned_grid[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
%%time
# Creating pipeline
pipe_ada_ran = make_pipeline(StandardScaler(), AdaBoostClassifier(random_state=1))
# Parameter grid to pass in GridSearchCV
param_grid = {
"adaboostclassifier__n_estimators": np.arange(10, 110, 10),
"adaboostclassifier__learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
"adaboostclassifier__base_estimator": [
DecisionTreeClassifier(max_depth=1, random_state=1),
DecisionTreeClassifier(max_depth=2, random_state=1),
DecisionTreeClassifier(max_depth=3, random_state=1),
],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
abc_rand_cv = RandomizedSearchCV(estimator=pipe_ada_ran, param_distributions=param_grid, n_iter=10,n_jobs = -1, scoring=scorer, cv=5, random_state=1)
#Fitting parameters in RandomizedSearchCV
abc_rand_cv.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(abc_rand_cv.best_params_,abc_rand_cv.best_score_))
Best parameters are {'adaboostclassifier__n_estimators': 90, 'adaboostclassifier__learning_rate': 1, 'adaboostclassifier__base_estimator': DecisionTreeClassifier(max_depth=2, random_state=1)} with CV score=0.865677409382487:
Wall time: 8.19 s
# Creating new pipeline with best parameters
abc_tuned_rand = make_pipeline(
StandardScaler(),AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2,
random_state=1),
learning_rate=1, n_estimators=90))
# Fit the model on training data
abc_tuned_rand.fit(X_train, y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
('adaboostclassifier',
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2,
random_state=1),
learning_rate=1, n_estimators=90))])
abc_rand_tuned_score=get_metrics_score(abc_tuned_rand,' Adaboost with Random Search',X_train,X_test,y_train,y_test)
| Metric | Train_Accuracy | Test_Accuracy | Train_Recall | Test_Recall | Train_Precision | Test_Precision | Train_F1-Score | Test_F1-Score |
|---|---|---|---|---|---|---|---|---|
| Score | 0.99196 | 0.96512 | 0.96752 | 0.88115 | 0.98217 | 0.89958 | 0.97479 | 0.89027 |
feature_names = X_train.columns
importances = abc_tuned_rand[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
%%time
# Creating pipeline
pipe_gb_grid = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=1))
# Grid of parameters to choose from
param_grid = {'gradientboostingclassifier__n_estimators':[100,200],
'gradientboostingclassifier__max_depth':[10,20],
'gradientboostingclassifier__min_samples_leaf': [10,20],
'gradientboostingclassifier__min_samples_split': [25,35]
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Run the grid search
grid_cv = GridSearchCV(pipe_gb_grid, param_grid, scoring=scorer,cv=5,n_jobs = -1)
# Fitting parameters in GridSeachCV
pipe_gb_grid = grid_cv.fit(X_train, y_train)
print("Best parameters are {} with CV score={}:" .format(pipe_gb_grid.best_params_,grid_cv.best_score_))
Best parameters are {'gradientboostingclassifier__max_depth': 10, 'gradientboostingclassifier__min_samples_leaf': 20, 'gradientboostingclassifier__min_samples_split': 25, 'gradientboostingclassifier__n_estimators': 100} with CV score=0.8656812736687535:
Wall time: 2min 36s
# Creating new pipeline with best parameters
gb_tuned_grid = make_pipeline(
StandardScaler(),GradientBoostingClassifier(max_depth=20,
min_samples_leaf=20,
min_samples_split=25,
n_estimators=200, random_state=1
))
# Fit the model on training data
gb_tuned_grid.fit(X_train, y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
('gradientboostingclassifier',
GradientBoostingClassifier(max_depth=20, min_samples_leaf=20,
min_samples_split=25,
n_estimators=200,
random_state=1))])
gb_tuned_score=get_metrics_score(gb_tuned_grid,' Gradient with Grid Search',X_train,X_test,y_train,y_test)
| Metric | Train_Accuracy | Test_Accuracy | Train_Recall | Test_Recall | Train_Precision | Test_Precision | Train_F1-Score | Test_F1-Score |
|---|---|---|---|---|---|---|---|---|
| Score | 1.00000 | 0.97269 | 1.00000 | 0.89754 | 1.00000 | 0.92994 | 1.00000 | 0.91345 |
feature_names = X_train.columns
importances = gb_tuned_grid[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
%%time
pipe_gb_rand = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=1))
param_grid = {'gradientboostingclassifier__n_estimators':[100,200],
'gradientboostingclassifier__max_depth':[10,20],
'gradientboostingclassifier__min_samples_leaf': [10,20],
'gradientboostingclassifier__min_samples_split': [25,35]
}
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
pipe_gb_rand = RandomizedSearchCV(estimator=pipe_gb_rand, param_distributions=param_grid,n_jobs = -1, n_iter=10, scoring=scorer, cv=5, random_state=1)
#Fitting parameters in RandomizedSearchCV
pipe_gb_rand.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(pipe_gb_rand.best_params_,pipe_gb_rand.best_score_))
Best parameters are {'gradientboostingclassifier__n_estimators': 100, 'gradientboostingclassifier__min_samples_split': 35, 'gradientboostingclassifier__min_samples_leaf': 20, 'gradientboostingclassifier__max_depth': 10} with CV score=0.8656812736687535:
Wall time: 1min 16s
gb_tuned_rand = make_pipeline(
StandardScaler(),GradientBoostingClassifier(max_depth=20, min_samples_leaf=20,
min_samples_split=25,
n_estimators=200,
random_state=1))
# Fit the model on training data
gb_tuned_rand.fit(X_train, y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
('gradientboostingclassifier',
GradientBoostingClassifier(max_depth=20, min_samples_leaf=20,
min_samples_split=25,
n_estimators=200,
random_state=1))])
gb_rand_tuned_score=get_metrics_score(gb_tuned_rand,' Gradient boosting with Random Search',X_train,X_test,y_train,y_test)
| Metric | Train_Accuracy | Test_Accuracy | Train_Recall | Test_Recall | Train_Precision | Test_Precision | Train_F1-Score | Test_F1-Score |
|---|---|---|---|---|---|---|---|---|
| Score | 1.00000 | 0.97269 | 1.00000 | 0.89754 | 1.00000 | 0.92994 | 1.00000 | 0.91345 |
feature_names = X_train.columns
importances = gb_tuned_rand[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
%%time
#Creating pipeline
#Creating pipeline
pipe_xgboost=make_pipeline(StandardScaler(), XGBClassifier(random_state=1,eval_metric='logloss'))
#Parameter grid to pass in GridSearchCV
param_grid={'xgbclassifier__n_estimators':np.arange(50,300,50),'xgbclassifier__scale_pos_weight':[2,10],
'xgbclassifier__learning_rate':[0.01,0.1,0.2],
'xgbclassifier__subsample':[0.7,1]}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling GridSearchCV
Xgboost_grid_cv = GridSearchCV(estimator=pipe_xgboost, param_grid=param_grid, scoring=scorer, cv=5, n_jobs = -1)
#Fitting parameters in GridSeachCV
Xgboost_grid_cv.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(Xgboost_grid_cv.best_params_,Xgboost_grid_cv.best_score_))
Best parameters are {'xgbclassifier__learning_rate': 0.01, 'xgbclassifier__n_estimators': 200, 'xgbclassifier__scale_pos_weight': 10, 'xgbclassifier__subsample': 1} with CV score=0.9482108354586908:
Wall time: 3min 3s
# Creating new pipeline with best parameters
xgb_tuned_grid = make_pipeline(
StandardScaler(),
XGBClassifier(
random_state=1,
n_estimators=150,
scale_pos_weight=10,
subsample=1,
learning_rate=0.01,
eval_metric='logloss',
),
)
# Fit the model on training data
xgb_tuned_grid.fit(X_train, y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
('xgbclassifier',
XGBClassifier(base_score=0.5, booster='gbtree',
colsample_bylevel=1, colsample_bynode=1,
colsample_bytree=1, enable_categorical=False,
eval_metric='logloss', gamma=0, gpu_id=-1,
importance_type=None, interaction_constraints='',
learning_rate=0.01, max_delta_step=0,
max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=150,
n_jobs=8, num_parallel_tree=1, predictor='auto',
random_state=1, reg_alpha=0, reg_lambda=1,
scale_pos_weight=10, subsample=1,
tree_method='exact', validate_parameters=1,
verbosity=None))])
xgb_tuned_score_grid=get_metrics_score(xgb_tuned_grid,' XGboost with Grid Search',X_train,X_test,y_train,y_test)
| Metric | Train_Accuracy | Test_Accuracy | Train_Recall | Test_Recall | Train_Precision | Test_Precision | Train_F1-Score | Test_F1-Score |
|---|---|---|---|---|---|---|---|---|
| Score | 0.93030 | 0.90326 | 0.99561 | 0.95492 | 0.69871 | 0.63144 | 0.82114 | 0.76020 |
%%time
#Creating pipeline
pipe_xgboost_ran=make_pipeline(StandardScaler(), XGBClassifier(random_state=1,eval_metric='logloss'))
#Parameter grid to pass in random
param_grid={'xgbclassifier__n_estimators':np.arange(50,300,50),'xgbclassifier__scale_pos_weight':[2,10],
'xgbclassifier__learning_rate':[0.01,0.1,0.2],
'xgbclassifier__subsample':[0.7,1]}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=pipe_xgboost_ran, param_distributions=param_grid,n_jobs = -1, n_iter=10, scoring=scorer, cv=5, random_state=1)
#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))
Best parameters are {'xgbclassifier__subsample': 0.7, 'xgbclassifier__scale_pos_weight': 10, 'xgbclassifier__n_estimators': 50, 'xgbclassifier__learning_rate': 0.01} with CV score=0.9429283561326223:
Wall time: 25.6 s
# Creating new pipeline with best parameters
xgb_rand = make_pipeline(
StandardScaler(),
XGBClassifier(
random_state=1,
n_estimators=50,
scale_pos_weight=10,
subsample=0.7,
learning_rate=0.01,
eval_metric='logloss',
),
)
# Fit the model on training data
xgb_rand.fit(X_train, y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
('xgbclassifier',
XGBClassifier(base_score=0.5, booster='gbtree',
colsample_bylevel=1, colsample_bynode=1,
colsample_bytree=1, enable_categorical=False,
eval_metric='logloss', gamma=0, gpu_id=-1,
importance_type=None, interaction_constraints='',
learning_rate=0.01, max_delta_step=0,
max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=50,
n_jobs=8, num_parallel_tree=1, predictor='auto',
random_state=1, reg_alpha=0, reg_lambda=1,
scale_pos_weight=10, subsample=0.7,
tree_method='exact', validate_parameters=1,
verbosity=None))])
randomized_cv_tuned_score=get_metrics_score(randomized_cv,'XG boosting with Random Search',X_train,X_test,y_train,y_test)
| Metric | Train_Accuracy | Test_Accuracy | Train_Recall | Test_Recall | Train_Precision | Test_Precision | Train_F1-Score | Test_F1-Score |
|---|---|---|---|---|---|---|---|---|
| Score | 0.91168 | 0.89141 | 0.98946 | 0.94877 | 0.64733 | 0.60286 | 0.78264 | 0.73726 |
comparison_frame = pd.DataFrame({'Model':model_name,
'Train_Accuracy': acc_train,'Test_Accuracy': acc_test,
'Train_Recall':recall_train,'Test_Recall':recall_test,
'Train_Precision':precision_train,'Test_Precision':precision_test,
'Train_F1':f1_train,
'Test_F1':f1_test })
#Sorting models in decreasing order of test recall
comparison_frame.sort_values(by='Test_Recall',ascending=False)
| Model | Train_Accuracy | Test_Accuracy | Train_Recall | Test_Recall | Train_Precision | Test_Precision | Train_F1 | Test_F1 | |
|---|---|---|---|---|---|---|---|---|---|
| 9 | XGboost with Grid Search | 0.93030 | 0.90326 | 0.99561 | 0.95492 | 0.69871 | 0.63144 | 0.82114 | 0.76020 |
| 10 | XG boosting with Random Search | 0.91168 | 0.89141 | 0.98946 | 0.94877 | 0.64733 | 0.60286 | 0.78264 | 0.73726 |
| 7 | Gradient with Grid Search | 1.00000 | 0.97269 | 1.00000 | 0.89754 | 1.00000 | 0.92994 | 1.00000 | 0.91345 |
| 8 | Gradient boosting with Random Search | 1.00000 | 0.97269 | 1.00000 | 0.89754 | 1.00000 | 0.92994 | 1.00000 | 0.91345 |
| 5 | Adaboost with Grid Search | 0.98476 | 0.96644 | 0.93766 | 0.88730 | 0.96652 | 0.90208 | 0.95187 | 0.89463 |
| 6 | Adaboost with Random Search | 0.99196 | 0.96512 | 0.96752 | 0.88115 | 0.98217 | 0.89958 | 0.97479 | 0.89027 |
| 3 | LogisticRegression with under sampling | 0.85777 | 0.84962 | 0.85777 | 0.84016 | 0.85777 | 0.51965 | 0.85777 | 0.64213 |
| 4 | LogisticRegression with Regularization on Undersampled | 0.85558 | 0.84765 | 0.85601 | 0.84016 | 0.85526 | 0.51572 | 0.85564 | 0.63913 |
| 0 | LogisticRegression | 0.91366 | 0.91214 | 0.62687 | 0.63730 | 0.79245 | 0.77556 | 0.70000 | 0.69966 |
| 1 | LogisticRegression with over sampling | 0.93612 | 0.90030 | 0.91528 | 0.61066 | 0.95510 | 0.72506 | 0.93476 | 0.66296 |
| 2 | LogisticRegression with Regularization on Over sampling | 0.94175 | 0.90293 | 0.91898 | 0.60656 | 0.96284 | 0.74185 | 0.94040 | 0.66742 |
feature_names = X_train.columns
importances = xgb_tuned_grid[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()